library(data.table)
library(dplyr)
library(tidyr)
library(readxl)
library(biomaRt)
library(forcats)
library(qqman)
library(lmtest)
library(kableExtra)
library(Biobase)
cases <- read_excel("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/Case_RSV_Severe.xlsx")
controls <- read_excel("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/Control_RSV_mild.xlsx")
other <- read_excel("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/Case_RSV_Severe_Trios.xlsx")
I use a simple binary coding for the two phenotypes, 0 for normal outcome and 1 for severe
cases <- cases %>% drop_na(`Genotype number`)
controls <- controls %>% drop_na(`Genotype number`)
phenotype1 <- data.frame("Phenotype"=matrix(2, nrow=nrow(cases), ncol=1))
phenotype2 <- data.frame("Phenotype"=matrix(1, nrow=nrow(controls), ncol=1))
Cases <- cbind(cases, phenotype1)
Controls <- cbind(controls, phenotype2)
Full_Data <- rbind(Cases, Controls)
Full_Data[is.na(Full_Data)] <- "00"
Full_Data$Gender <- ifelse(Full_Data$Gender=="Male",1,2)
Full_Data$`Smoking exposure` <- ifelse(Full_Data$`Smoking exposure`=="No",0,1)
Full_Data$`Pet exposure` <- ifelse(Full_Data$`Pet exposure`=="No",0,1)
Full_Data$`Associated infection` <- ifelse(Full_Data$`Associated infection`=="No",0,1)
write.table(Full_Data,
"/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/Full_Data.txt",
sep="\t",
quote=T)
head(Full_Data)
## Genotype number Age in Months Race Gender
## 1 14000 10.4147844314575 Black or African American 1
## 2 14016 3.6665999999999999 White 1
## 3 14019 6.4333 White 2
## 4 14027 0.63333300000000003 White 2
## 5 14041 1.6427104473114 White 2
## 6 14044 1.2156057357788099 Hispanic 2
## Smoking exposure Pet exposure Current weight Associated infection rs1059046
## 1 0 0 8 0 AA
## 2 0 0 6 0 CC
## 3 0 0 7 0 CC
## 4 0 0 2.57 0 AA
## 5 0 0 2.2 0 CC
## 6 0 0 3 1 AA
## rs17886395 rs1965707 rs1965708 rs1059047 rs1136450 rs1136451
## 1 GG CT AC TT CG AA
## 2 CC CT CC CT CC GG
## 3 CC CC CC CC CC GG
## 4 GG CT CC TT GG AA
## 5 CC CT CC CT CC GG
## 6 GG CC CC TT GG AA
## rs1059057/rs141764116 rs4253527 rs1130866 rs4715 rs1124 rs721917 rs2243639
## 1 AA CC 00 00 00 CC GG
## 2 AG CT 00 00 00 CC GG
## 3 GG CC 00 00 00 CC GG
## 4 AA CC 00 00 00 TT AA
## 5 AG CT 00 00 00 TC AG
## 6 AA CC 00 00 00 TC AG
## rs2077079 rs3024798 SP-A2 Genotype SP-A1 Genotype Phenotype
## 1 00 00 1A0/1A3 6A2/6A3 2
## 2 00 00 1A/1A5 6A/6A4 2
## 3 00 00 1A/1A 6A/6A 2
## 4 00 00 1A0/1A9 6A2/6A2 2
## 5 00 00 1A/1A5 6A/6A4 2
## 6 00 00 1A0/1A0 6A2/6A2 2
Plink is a software run the command shell or terminal meaning that results cannot be output to R, so I have included the command I use for the association along with a summary of the results so that this can be visualized in a single document
plink --bfile RSVData --covar RSVCovariates.txt --logistic
I clean up the ancestry portion of the data and code it by corresponding factor levels (1:Asian, 2:Black/AA, 3:Hispanic, 4:Mix, 5:Other, 6:White)
Full_Data$Race <- as.factor(Full_Data$Race)
Full_Data$Race <- Full_Data$Race %>% fct_collapse(Other=c("Other", "Other/Unknown"),
"White"=c("white", "White"))
## I tried using dummy variable by first encoding them myself into a covariate file and then using the plink call below
## plink --bfile RSVData --covar RSVCovariates.txt --logistic
Full_Data$Asian <- ifelse(Full_Data$Race=="Asian", 1, 0)
Full_Data$Black <- ifelse(Full_Data$Race=="Black or African American", 1, 0)
Full_Data$Hispanic <- ifelse(Full_Data$Race=="Hispanic", 1, 0)
Full_Data$Mix <- ifelse(Full_Data$Race=="Mix", 1, 0)
Full_Data$Other <- ifelse(Full_Data$Race=="Other", 1, 0)
I use the biomaRt package to get the chromosome and position for each of the rsIDs which are then passed along with the IDs themselves to MAP file for PLINK associations
mart = useMart("ENSEMBL_MART_SNP", dataset="hsapiens_snp")
snp_ids = colnames(Full_Data)[9:24]
one <- strsplit(snp_ids[8], "/")[[1]][1]
two <- strsplit(snp_ids[8], "/")[[1]][2]
snp_ids2 <- c(snp_ids, one, two)
snp_attributes = c("refsnp_id", "chr_name", "chrom_start")
snp_locations = getBM(attributes=snp_attributes,
filters="snp_filter",
values=snp_ids2,
mart=mart)
snp_locations
## refsnp_id chr_name chrom_start
## 1 rs1059046 10 79559458
## 2 rs1059047 10 79611881
## 3 rs1059057 10 79613765
## 4 rs1124 8 22164004
## 5 rs1130866 2 85666618
## 6 rs1136450 10 79611973
## 7 rs1136451 10 79612325
## 8 rs17886395 10 79558907
## 9 rs1965707 10 79557536
## 10 rs1965708 10 79557289
## 11 rs2077079 2 85668215
## 12 rs2243639 10 79941966
## 13 rs3024798 2 85667185
## 14 rs4253527 10 79614021
## 15 rs4715 8 22163524
## 16 rs721917 10 79946568
Both a PED and MAP file are required for the PLINK associations, with the PED file including IDs, sex and phenotype, and the MAP file including the chromosome, rsID, genetic distance and position of the snp It should be noted that in the original data the snps which were coded as “rs1059057/rs141764116” when mapped using biomaRt came up as only rs1059057 and as such were reffered to here on out as just that single snp
PEDFile <- data.frame(cbind("FID"=paste0("FAM", sample(seq(1, nrow(Full_Data), by=1), replace=F)),
"IID"= 1,
"PID"=1,
"MID"=1,
"SEX"=Full_Data$Gender,
"Pheno"=Full_Data$Phenotype,
"rs1059046"=unlist(lapply(strsplit(Full_Data$rs1059046,''), paste, collapse = ' ')),
"rs1059047"=unlist(lapply(strsplit(Full_Data$rs1059047,''), paste, collapse = ' ')),
"rs1059057"=unlist(lapply(strsplit(Full_Data$`rs1059057/rs141764116`,''), paste, collapse = ' ')),
"rs1124"=unlist(lapply(strsplit(Full_Data$rs1124,''), paste, collapse = ' ')),
"rs1130866"=unlist(lapply(strsplit(Full_Data$rs1130866,''), paste, collapse = ' ')),
"rs1136450"=unlist(lapply(strsplit(Full_Data$rs1136450,''), paste, collapse = ' ')),
"rs1136451"=unlist(lapply(strsplit(Full_Data$rs1136451,''), paste, collapse = ' ')),
"rs17886395"=unlist(lapply(strsplit(Full_Data$rs17886395,''), paste, collapse = ' ')),
"rs1965707"=unlist(lapply(strsplit(Full_Data$rs1965707,''), paste, collapse = ' ')),
"rs1965708"=unlist(lapply(strsplit(Full_Data$rs1965708,''), paste, collapse = ' ')),
"rs2077079"=unlist(lapply(strsplit(Full_Data$rs2077079,''), paste, collapse = ' ')),
"rs2243639"=unlist(lapply(strsplit(Full_Data$rs2243639,''), paste, collapse = ' ')),
"rs3024798"=unlist(lapply(strsplit(Full_Data$rs3024798,''), paste, collapse = ' ')),
"rs4253527"=unlist(lapply(strsplit(Full_Data$rs4253527,''), paste, collapse = ' ')),
"rs4715"=unlist(lapply(strsplit(Full_Data$rs4715,''), paste, collapse = ' ')),
"rs721917"=unlist(lapply(strsplit(Full_Data$rs721917,''), paste, collapse = ' '))))
MAPFile <- data.frame(cbind("chromosome"=snp_locations$chr_name,
"rsID"=data.frame("rsID"=snp_locations$refsnp_id),
"Genetic Distance"=data.frame("Genetic Distance"=matrix(0, nrow=length(colnames(Full_Data)[9:24]), ncol=1)),
"BP"=snp_locations$chrom_start))
CovarFile <- data.frame(cbind("FID"=PEDFile$FID,
"IID"=PEDFile$IID,
"Sex"=PEDFile$SEX,
"Smoking"=Full_Data$`Smoking exposure`, # Remove this line for original covariates, only run for updated model
"Pets"=Full_Data$`Pet exposure`, # Remove this line for original covariates, only run for updated model
"Infection"=Full_Data$`Associated infection`, # Remove this line for original covariates, only run for updated model
"Ancestry"=Full_Data$Race))
#"Other"=Full_Data$Other,
#"Mix"=Full_Data$Mix,
#"Hispanic"=Full_Data$Hispanic,
#"Black"=Full_Data$Black,
#"Asian"=Full_Data$Asian))
write.table(PEDFile,
"/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVData.ped",
sep="\t",
quote=F,
col.names=F,
row.names=F)
write.table(MAPFile,
"/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVData.map",
sep="\t",
quote=F,
col.names=F,
row.names=F)
write.table(CovarFile,
"/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVCovariates.txt",
sep="\t",
quote=F,
col.names=T,
row.names=F)
Next I tried to use the plink software to automatically create the dummy variables for Ancestry only and then re-read in the covariate file to add in the age covariate
plink --file RSVData --covar RSVCovariates.txt --write-covar --dummy-coding
Covar <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/plink.cov", sep=" ", header=T)
Covar1 <- cbind(Covar, "Age"=as.numeric(Full_Data$`Age in Months`))
Covar2 <- cbind(Covar, "Age"=as.numeric(Full_Data$`Age in Months`), "SexAge"=as.numeric(Full_Data$`Age in Months`)*Covar$Sex)
write.table(Covar1,
"/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVCovariates.txt",
sep="\t",
quote=F,
col.names=T,
row.names=F)
write.table(Covar2,
"/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVCovariates.Interaction.txt",
sep="\t",
quote=F,
col.names=T,
row.names=F)
Finally we convert our file to respective bed, bim and fam files to read into PLINK. I used PLINK2 for the logistic models and PLINK for the association maodel. Additionnally I used both PLINK’s dummy-variable software to create the ancestry covariates as well as making my own. Between these there was no difference in the model results.
plink2 --file RSVData --out RSVData --make-bed
plink --file RSVData --covar RSVCovariates.txt --assoc
plink2 --bfile RSVData --covar RSVCovariates.txt --glm hide-covar
And here are our results for the association:
PLINK v2.00a2.3 64-bit (24 Jan 2020) www.cog-genomics.org/plink/2.0/
(C) 2005-2020 Shaun Purcell, Christopher Chang GNU General Public License v3
Logging to plink2.log.
Options in effect:
--bfile RSVData
--covar RSVCovariates.txt
--glm hide-covar
Start time: Mon Oct 18 06:45:05 2021
8192 MiB RAM detected; reserving 4096 MiB for main workspace.
Using up to 8 compute threads.
402 samples (186 females, 216 males; 0 founders) loaded from RSVData.fam.
16 variants loaded from RSVData.bim.
1 binary phenotype loaded (231 cases, 171 controls).
20 covariates loaded from RSVCovariates.txt.
Warning: --glm remaining control count is less than 10x predictor count for
phenotype 'PHENO1'.
Warning: Not including covariate 'Ancestry_4' in --glm regression on phenotype
'PHENO1'.
--glm logistic regression on phenotype 'PHENO1': done.
Results written to plink2.PHENO1.glm.logistic .
You will notice above in the line marked (HERE), we are given a warning concerning one of our Ancestry dummy variables, this is due to PLINK2’s automatic checking of multicolinearity, scale check and relevance. If a covariate does not pass these checks it is excluded from the analysis.
According to PLINK’s documentation: “the p-values for each line reflect the effect of the entity under the TEST column, not of the SNP whilst controlling for that particular covariate”. So we need to look at the TEST labeled ADD for the additive model containing the covariates, this P-Value will give us the significance of the SNP.
results <-fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/plink2.PHENO1.glm.logistic",
sep="\t",
header=T)
colnames(results)[colnames(results) == "#CHROM"] <- "CHR"
colnames(results)[colnames(results) == "POS"] <- "BP"
colnames(results)[colnames(results) == "ID"] <- "SNP"
results <- transform(results, CHR = as.numeric(CHR))
results <- transform(results, P = as.numeric(P))
results
## CHR BP SNP REF ALT A1 TEST OBS_CT OR LOG(OR)_SE
## 1: 2 85666618 rs1130866 T C C ADD 254 0.942286 0.206121
## 2: 2 85667185 rs3024798 C A A ADD 128 0.581077 0.302927
## 3: 2 85668215 rs2077079 A C C ADD 128 0.795586 0.279176
## 4: 8 22163524 rs4715 C A A ADD 255 1.074560 0.231312
## 5: 8 22164004 rs1124 G A A ADD 255 1.138140 0.229669
## 6: 10 79557289 rs1965708 C A A ADD 390 0.825855 0.182583
## 7: 10 79557536 rs1965707 C T T ADD 390 1.014570 0.162309
## 8: 10 79558907 rs17886395 G C C ADD 390 0.756156 0.126194
## 9: 10 79559458 rs1059046 A C C ADD 261 NA NA
## 10: 10 79611881 rs1059047 T C C ADD 263 NA NA
## 11: 10 79611973 rs1136450 G C C ADD 263 NA NA
## 12: 10 79612325 rs1136451 A G G ADD 392 1.242840 0.183442
## 13: 10 79613765 rs1059057 A G G ADD 384 1.058970 0.249147
## 14: 10 79614021 rs4253527 C T T ADD 391 1.389630 0.239280
## 15: 10 79941966 rs2243639 G A A ADD 269 NA NA
## 16: 10 79946568 rs721917 T C C ADD 400 1.234620 0.151413
## Z_STAT P
## 1: -0.2884080 0.7730340
## 2: -1.7920900 0.0731185
## 3: -0.8191110 0.4127230
## 4: 0.3108910 0.7558840
## 5: 0.5633940 0.5731670
## 6: -1.0479400 0.2946660
## 7: 0.0891045 0.9289990
## 8: -2.2149100 0.0267662
## 9: NA NA
## 10: NA NA
## 11: NA NA
## 12: 1.1851000 0.2359780
## 13: 0.2299910 0.8180990
## 14: 1.3751200 0.1690960
## 15: NA NA
## 16: 1.3919600 0.1639360
For the results I show both the hard data for each model as well as a quantile-quantile plot and manhattan plot. For the latter I set the significance equal to the standard 0.05 and divided by the number of variants (16). This is then plotted as the -log10 of the P-Value for each SNP and shown on the Y-axis. This is the level at which a SNP becomes significant in our analysis.
qq(results$P, cex=1, main="QQ-Plot")
manhattan(results,chr="CHR",bp="BP",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Normal Covariates")
As none of the SNPs show significance we move on to a systematic removal of covariates in order to see if this changes the significance of our results.
Fisrt we look at the model without the Sex covariate only.
MinusSex <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/MinusAge.PHENO1.glm.logistic",
sep="\t", header=T)
MinusSex
## #CHROM POS ID REF ALT A1 TEST OBS_CT OR LOG(OR)_SE
## 1: 2 85666618 rs1130866 T C C ADD 254 0.948712 0.205275
## 2: 2 85667185 rs3024798 C A A ADD 128 0.576672 0.302266
## 3: 2 85668215 rs2077079 A C C ADD 128 0.795599 0.278560
## 4: 8 22163524 rs4715 C A A ADD 255 1.068810 0.230825
## 5: 8 22164004 rs1124 G A A ADD 255 1.115600 0.227948
## 6: 10 79557289 rs1965708 C A A ADD 390 0.827922 0.182307
## 7: 10 79557536 rs1965707 C T T ADD 390 1.018450 0.161078
## 8: 10 79558907 rs17886395 G C C ADD 390 0.757868 0.125969
## 9: 10 79559458 rs1059046 A C C ADD 261 NA NA
## 10: 10 79611881 rs1059047 T C C ADD 263 NA NA
## 11: 10 79611973 rs1136450 G C C ADD 263 NA NA
## 12: 10 79612325 rs1136451 A G G ADD 392 1.243010 0.183416
## 13: 10 79613765 rs1059057 A G G ADD 384 1.057330 0.248981
## 14: 10 79614021 rs4253527 C T T ADD 391 1.391180 0.238670
## 15: 10 79941966 rs2243639 G A A ADD 269 NA NA
## 16: 10 79946568 rs721917 T C C ADD 400 1.237880 0.151189
## Z_STAT P
## 1: -0.256484 0.7975770
## 2: -1.821190 0.0685784
## 3: -0.820862 0.4117250
## 4: 0.288282 0.7731310
## 5: 0.479917 0.6312870
## 6: -1.035820 0.3002880
## 7: 0.113501 0.9096340
## 8: -2.200900 0.0277429
## 9: NA NA
## 10: NA NA
## 11: NA NA
## 12: 1.186000 0.2356210
## 13: 0.223902 0.8228340
## 14: 1.383290 0.1665760
## 15: NA NA
## 16: 1.411490 0.1580990
qq(MinusSex$P, cex=1, main="QQ-Plot")
manhattan(MinusSex,chr="#CHROM",bp="POS",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Age and Ancestry")
## Warning in manhattan(MinusSex, chr = "#CHROM", bp = "POS", p = "P", snp =
## "SNP", : No SNP column found. OK unless you're trying to highlight.
Next we look at the model without the Age covariate only.
MinusAge <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/MinusSex.PHENO1.glm.logistic",
sep="\t", header=T)
MinusAge
## #CHROM POS ID REF ALT A1 TEST OBS_CT OR LOG(OR)_SE
## 1: 2 85666618 rs1130866 T C C ADD 254 0.935605 0.205699
## 2: 2 85667185 rs3024798 C A A ADD 128 0.578317 0.302508
## 3: 2 85668215 rs2077079 A C C ADD 128 0.792010 0.278736
## 4: 8 22163524 rs4715 C A A ADD 255 1.085560 0.230701
## 5: 8 22164004 rs1124 G A A ADD 255 1.148210 0.228619
## 6: 10 79557289 rs1965708 C A A ADD 390 0.831288 0.182037
## 7: 10 79557536 rs1965707 C T T ADD 390 1.018560 0.162034
## 8: 10 79558907 rs17886395 G C C ADD 390 0.755126 0.126119
## 9: 10 79559458 rs1059046 A C C ADD 261 NA NA
## 10: 10 79611881 rs1059047 T C C ADD 263 NA NA
## 11: 10 79611973 rs1136450 G C C ADD 263 NA NA
## 12: 10 79612325 rs1136451 A G G ADD 392 1.242940 0.183421
## 13: 10 79613765 rs1059057 A G G ADD 384 1.059340 0.249145
## 14: 10 79614021 rs4253527 C T T ADD 391 1.389000 0.239199
## 15: 10 79941966 rs2243639 G A A ADD 269 NA NA
## 16: 10 79946568 rs721917 T C C ADD 400 1.234390 0.151381
## Z_STAT P
## 1: -0.323591 0.7462480
## 2: -1.810310 0.0702473
## 3: -0.836564 0.4028370
## 4: 0.355867 0.7219400
## 5: 0.604503 0.5455090
## 6: -1.015060 0.3100760
## 7: 0.113477 0.9096520
## 8: -2.227030 0.0259453
## 9: NA NA
## 10: NA NA
## 11: NA NA
## 12: 1.185700 0.2357410
## 13: 0.231362 0.8170340
## 14: 1.373700 0.1695360
## 15: NA NA
## 16: 1.391030 0.1642160
qq(MinusAge$P, cex=1, main="QQ-Plot")
manhattan(MinusAge,chr="#CHROM",bp="POS",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Sex and Ancestry")
## Warning in manhattan(MinusAge, chr = "#CHROM", bp = "POS", p = "P", snp =
## "SNP", : No SNP column found. OK unless you're trying to highlight.
Then we look at the model without both Sex and Age.
MinusSexandAge <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/MinusSexandAge.PHENO1.glm.logistic",
sep="\t", header=T)
MinusSexandAge
## #CHROM POS ID REF ALT A1 TEST OBS_CT OR LOG(OR)_SE
## 1: 2 85666618 rs1130866 T C C ADD 254 0.941023 0.204836
## 2: 2 85667185 rs3024798 C A A ADD 128 0.573536 0.301799
## 3: 2 85668215 rs2077079 A C C ADD 128 0.791666 0.278062
## 4: 8 22163524 rs4715 C A A ADD 255 1.078470 0.230226
## 5: 8 22164004 rs1124 G A A ADD 255 1.125570 0.226898
## 6: 10 79557289 rs1965708 C A A ADD 390 0.833586 0.181736
## 7: 10 79557536 rs1965707 C T T ADD 390 1.022750 0.160758
## 8: 10 79558907 rs17886395 G C C ADD 390 0.756856 0.125902
## 9: 10 79559458 rs1059046 A C C ADD 261 NA NA
## 10: 10 79611881 rs1059047 T C C ADD 263 NA NA
## 11: 10 79611973 rs1136450 G C C ADD 263 NA NA
## 12: 10 79612325 rs1136451 A G G ADD 392 1.243130 0.183393
## 13: 10 79613765 rs1059057 A G G ADD 384 1.057700 0.248975
## 14: 10 79614021 rs4253527 C T T ADD 391 1.390620 0.238596
## 15: 10 79941966 rs2243639 G A A ADD 269 NA NA
## 16: 10 79946568 rs721917 T C C ADD 400 1.237710 0.151152
## Z_STAT P
## 1: -0.296761 0.7666490
## 2: -1.842070 0.0654645
## 3: -0.840156 0.4008210
## 4: 0.328142 0.7428040
## 5: 0.521347 0.6021250
## 6: -1.001550 0.3165610
## 7: 0.139934 0.8887120
## 8: -2.212690 0.0269188
## 9: NA NA
## 10: NA NA
## 11: NA NA
## 12: 1.186680 0.2353550
## 13: 0.225297 0.8217480
## 14: 1.382040 0.1669590
## 15: NA NA
## 16: 1.410890 0.1582770
qq(MinusSexandAge$P, cex=1, main="QQ-Plot")
manhattan(MinusSexandAge,chr="#CHROM",bp="POS",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Ancestry Only")
## Warning in manhattan(MinusSexandAge, chr = "#CHROM", bp = "POS", p = "P", : No
## SNP column found. OK unless you're trying to highlight.
Finally we remove all covariates from the model.
NoCov <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/NoCov.PHENO1.glm.logistic",
sep="\t", header=T)
NoCov
## #CHROM POS ID REF ALT A1 TEST OBS_CT OR LOG(OR)_SE
## 1: 2 85666618 rs1130866 T C C ADD 254 0.947127 0.199107
## 2: 2 85667185 rs3024798 C A A ADD 128 0.540041 0.292559
## 3: 2 85668215 rs2077079 A C C ADD 128 0.727546 0.270009
## 4: 8 22163524 rs4715 C A A ADD 255 1.033200 0.225067
## 5: 8 22164004 rs1124 G A A ADD 255 1.041420 0.218707
## 6: 10 79557289 rs1965708 C A A ADD 391 0.874637 0.178413
## 7: 10 79557536 rs1965707 C T T ADD 391 1.047800 0.158849
## 8: 10 79558907 rs17886395 G C C ADD 391 0.762683 0.124702
## 9: 10 79559458 rs1059046 A C C ADD 262 0.935887 0.187012
## 10: 10 79611881 rs1059047 T C C ADD 264 1.329150 0.319219
## 11: 10 79611973 rs1136450 G C C ADD 264 0.900438 0.181578
## 12: 10 79612325 rs1136451 A G G ADD 393 1.262640 0.180970
## 13: 10 79613765 rs1059057 A G G ADD 385 1.075390 0.246670
## 14: 10 79614021 rs4253527 C T T ADD 392 1.395450 0.236007
## 15: 10 79941966 rs2243639 G A A ADD 270 0.654964 0.190580
## 16: 10 79946568 rs721917 T C C ADD 401 1.260120 0.149840
## Z_STAT P
## 1: -0.272830 0.7849840
## 2: -2.105930 0.0352101
## 3: -1.178030 0.2387860
## 4: 0.145123 0.8846140
## 5: 0.185548 0.8527990
## 6: -0.750767 0.4527930
## 7: 0.293919 0.7688200
## 8: -2.172490 0.0298191
## 9: -0.354310 0.7231070
## 10: 0.891361 0.3727350
## 11: -0.577569 0.5635550
## 12: 1.288640 0.1975240
## 13: 0.294649 0.7682620
## 14: 1.411880 0.1579850
## 15: -2.220460 0.0263879
## 16: 1.543050 0.1228190
qq(NoCov$P, cex=1, main="QQ-Plot")
manhattan(NoCov,chr="#CHROM",bp="POS",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot No Covariates")
## Warning in manhattan(NoCov, chr = "#CHROM", bp = "POS", p = "P", snp = "SNP", :
## No SNP column found. OK unless you're trying to highlight.
As the models continue to show a lack of significance in terms of the SNPs I then moved on to including an interaction term(s) in the models.
First I tried a basic Sex*Age interaction which is commonly used in GWAS/Meta-Analysis.
SexAgeInt <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/Interaction.PHENO1.glm.logistic",
sep="\t", header=T)
SexAgeInt
## #CHROM POS ID REF ALT A1 TEST OBS_CT OR LOG(OR)_SE
## 1: 2 85666618 rs1130866 T C C ADD 254 0.938388 0.206162
## 2: 2 85667185 rs3024798 C A A ADD 128 0.575510 0.304426
## 3: 2 85668215 rs2077079 A C C ADD 128 0.786036 0.281829
## 4: 8 22163524 rs4715 C A A ADD 255 1.071150 0.232011
## 5: 8 22164004 rs1124 G A A ADD 255 1.139410 0.229819
## 6: 10 79557289 rs1965708 C A A ADD 390 0.825296 0.182629
## 7: 10 79557536 rs1965707 C T T ADD 390 1.014670 0.162358
## 8: 10 79558907 rs17886395 G C C ADD 390 0.755542 0.126237
## 9: 10 79559458 rs1059046 A C C ADD 261 NA NA
## 10: 10 79611881 rs1059047 T C C ADD 263 NA NA
## 11: 10 79611973 rs1136450 G C C ADD 263 NA NA
## 12: 10 79612325 rs1136451 A G G ADD 392 1.243180 0.183408
## 13: 10 79613765 rs1059057 A G G ADD 384 1.057800 0.249148
## 14: 10 79614021 rs4253527 C T T ADD 391 1.391460 0.239253
## 15: 10 79941966 rs2243639 G A A ADD 269 NA NA
## 16: 10 79946568 rs721917 T C C ADD 400 1.236180 0.151476
## Z_STAT P
## 1: -0.3084540 0.7577370
## 2: -1.8148900 0.0695407
## 3: -0.8542480 0.3929680
## 4: 0.2962450 0.7670430
## 5: 0.5678660 0.5701260
## 6: -1.0513900 0.2930810
## 7: 0.0897176 0.9285120
## 8: -2.2205800 0.0263791
## 9: NA NA
## 10: NA NA
## 11: NA NA
## 12: 1.1868100 0.2353040
## 13: 0.2255210 0.8215740
## 14: 1.3807600 0.1673530
## 15: NA NA
## 16: 1.3997400 0.1615910
qq(SexAgeInt$P, cex=1, main="QQ-Plot")
manhattan(SexAgeInt,chr="#CHROM",bp="POS",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Sex*Age Interaction")
## Warning in manhattan(SexAgeInt, chr = "#CHROM", bp = "POS", p = "P", snp =
## "SNP", : No SNP column found. OK unless you're trying to highlight.
Moving on from the Sex*Age interaction I decided to try PLINK’s interaction function. In this function PLINK generates a swath of interations between each of the covariates and then filters based on multicollinearity and low impact.
Interact <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/Interaction2.PHENO1.glm.logistic",
sep="\t", header=T)
Interact
## #CHROM POS ID REF ALT A1 TEST OBS_CT OR
## 1: 2 85666618 rs1130866 T C C ADD 254 1.28271
## 2: 2 85666618 rs1130866 T C C Sex 254 1.09251
## 3: 2 85666618 rs1130866 T C C Ancestry_6 254 2.30183
## 4: 2 85666618 rs1130866 T C C Ancestry_3 254 6.99580
## 5: 2 85666618 rs1130866 T C C Ancestry_5 254 1.47750
## ---
## 204: 10 79946568 rs721917 T C C ADDxAncestry_6 400 NA
## 205: 10 79946568 rs721917 T C C ADDxAncestry_3 400 NA
## 206: 10 79946568 rs721917 T C C ADDxAncestry_5 400 NA
## 207: 10 79946568 rs721917 T C C ADDxAncestry_1 400 NA
## 208: 10 79946568 rs721917 T C C ADDxAge 400 NA
## LOG(OR)_SE Z_STAT P
## 1: 0.789424 0.315384 0.752470
## 2: 0.509691 0.173587 0.862190
## 3: 0.791885 1.052810 0.292429
## 4: 2.601760 0.747691 0.454647
## 5: 1.468810 0.265759 0.790425
## ---
## 204: NA NA NA
## 205: NA NA NA
## 206: NA NA NA
## 207: NA NA NA
## 208: NA NA NA
qq(Interact$P, cex=1, main="QQ-Plot")
manhattan(Interact,chr="#CHROM",bp="POS",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Full PLINK Interaction")
## Warning in manhattan(Interact, chr = "#CHROM", bp = "POS", p = "P", snp =
## "SNP", : No SNP column found. OK unless you're trying to highlight.
Finally I tried a basic association model (case/control) with the covariates, followed by an association without covariates to see if any significant SNPs could be detected.
Assoc1 <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/Association.assoc",
sep=" ", header=T)
qq(Assoc1$P, cex=1, main="QQ-Plot")
manhattan(Assoc1,chr="CHR",bp="BP",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Association Model w/ Covariates")
Assoc2 <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/Association2.assoc",
sep=" ", header=T)
qq(Assoc2$P, cex=1, main="QQ-Plot")
manhattan(Assoc2,chr="CHR",bp="BP",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Association Model w/o Covariates")
UpCovGLM <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/UpdatedCovariates.PHENO1.glm.logistic",
sep="\t", header=T)
UpCovAssoc <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/UpdatedCovariatesAssoc.assoc",
sep=" ", header=T)
qq(UpCovGLM$P, cex=1, main="QQ-Plot")
manhattan(UpCovGLM,chr="#CHROM",bp="POS",p="P",snp="ID",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Association Model w/ Updated Covariates")
qq(UpCovAssoc$P, cex=1, main="QQ-Plot")
manhattan(UpCovAssoc,chr="CHR",bp="BP",p="P",snp="SNP",col=c("orange","purple"),chrlabs=NULL,annotatePval=NULL,annotateTop=TRUE, ylim=c(0,4), suggestiveline=-log10(0.05/16), main="Manhattan Plot Association Model w/ Updated Covariates")
Analysis: Unfortunately as we can see from each of the models none of the SNPs recorded show a P-Value equal to, or less than our cutoff which suggests that there are no significant associations between the SNPs recorded and the outcomes for RSV. The covariates used seemed only usefull in the logistic models used, but generally speaking those models failed to produce results for particular SNPs, most commonly rs3024798 & rs2077079. According to PLINK, NA’s are the result of multicollinearity; “A multicollinearity check is performed before each regression. When it fails, the regression is skipped and ‘NA’ results are reported”. However this was not seen in the rs1059057/rs141764116, most likely due to the removal of rs141764116. As for the association models, it seems these performed the “best” in terms of SNPs being closest to significance but these models are rarely used anymore with linear/logistic models replacing them. Overall a lack of covariates showed the “best” results in both model types (logistic/association), but again only in terms of “SNPs being closest to significance”.
For a model using the multi-allelic varients to predict patient outcomes I build new a dataset using dummy encoding for each of the multiallelic variants setting the varient “00” as (0,….,0) for both SP-A1 and SP-A2 genotypes.
MultiAllelic <- data.frame(cbind(#"ID"=Full_Data$`Genotype number`,
"Sex"=Full_Data$Gender,
"Age"=Full_Data$`Age in Months`,
"Asian"=Full_Data$Asian,
"Black"=Full_Data$Black,
"Hispanic"=Full_Data$Hispanic,
"Mix"=Full_Data$Mix,
"Other"=Full_Data$Other,
"Smoking"=Full_Data$`Smoking exposure`,
"Pet"=Full_Data$`Pet exposure`,
"Infection"=Full_Data$`Associated infection`,
"Outcome"= ifelse(Full_Data$Phenotype==1,0,1)))
MultiAllelic$`V1A1A` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A/1A", 1, 0)
MultiAllelic$`V1A01A3` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A0/1A3", 1, 0)
MultiAllelic$`V1A1A5` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A/1A5", 1, 0)
MultiAllelic$`V1A01A9` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A0/1A9", 1, 0)
MultiAllelic$`V1A01A0` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A0/1A0", 1, 0)
MultiAllelic$`V1A1A0` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A/1A0", 1, 0)
MultiAllelic$`V1A1A1` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A/1A1", 1, 0)
MultiAllelic$`V1A01A2` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A0/1A2", 1, 0)
MultiAllelic$`V1A01A5` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A0/1A5", 1, 0)
MultiAllelic$`V1A11A1` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A1/1A1", 1, 0)
MultiAllelic$`V1A31A5` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A3/1A5", 1, 0)
MultiAllelic$`V1A11A2` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A1/1A2", 1, 0)
MultiAllelic$`V1A1A2` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A/1A2", 1, 0)
MultiAllelic$`V1A11A3` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A1/1A3", 1, 0)
MultiAllelic$`V1A11A5` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A1/1A5", 1, 0)
MultiAllelic$`V1A01A8` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A0/1A8", 1, 0)
MultiAllelic$`V1A1A3` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A/1A3", 1, 0)
MultiAllelic$`V1A21A2` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A2/1A2", 1, 0)
MultiAllelic$`V1A21A5` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A2/1A5", 1, 0)
MultiAllelic$`V1A51A10` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A5/1A10", 1, 0)
MultiAllelic$`V1A51A5` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A5/1A5", 1, 0)
MultiAllelic$`V1A11A10` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A1/1A10", 1, 0)
MultiAllelic$`V1A01A12` <- ifelse(Full_Data$`SP-A2 Genotype` == "1A0/1A12", 1, 0)
MultiAllelic$`VNE2` <- ifelse(Full_Data$`SP-A2 Genotype` == "NE2", 1, 0)
MultiAllelic$`VNE1` <- ifelse(Full_Data$`SP-A2 Genotype` == "NE1", 1, 0)
MultiAllelic$`VNE3` <- ifelse(Full_Data$`SP-A2 Genotype` == "NE3", 1, 0)
MultiAllelic$`VNE10` <- ifelse(Full_Data$`SP-A2 Genotype` == "NE10", 1, 0)
MultiAllelic$`V6A26A3` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A2/6A3", 1, 0)
MultiAllelic$`V6A6A4` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A/6A4", 1, 0)
MultiAllelic$`V6A6A` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A/6A", 1, 0)
MultiAllelic$`V6A26A2` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A2/6A2", 1, 0)
MultiAllelic$`V6A6A2` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A/6A2", 1, 0)
MultiAllelic$`V6A46A4` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A4/6A4", 1, 0)
MultiAllelic$`V6A6A3` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A2/6A3", 1, 0)
MultiAllelic$`V6A26A4` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A2/6A4", 1, 0)
MultiAllelic$`V6A36A3` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A3/6A3", 1, 0)
MultiAllelic$`V6A26A7V6A36A6` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A2/6A7 OR 6A3/6A6", 1, 0)
MultiAllelic$`V6A36A4` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A3/6A4", 1, 0)
MultiAllelic$`V6A126A13V6A26A18` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A12/6A13 OR 6A2/6A18", 1, 0)
MultiAllelic$`V6A26A8` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A2/6A8", 1, 0)
MultiAllelic$`V6A36A14` <- ifelse(Full_Data$`SP-A1 Genotype` == "6A3/6A14", 1, 0)
MultiAllelic <- sapply(MultiAllelic, as.numeric)
MultiAllelic <- as.data.frame(MultiAllelic)
MultiAllelicLM <- glm(Outcome ~ ., data=MultiAllelic, family=binomial)
summary(MultiAllelicLM)
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = MultiAllelic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4333 -0.9254 0.3008 0.8242 2.2002
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.098e-01 4.280e-01 -0.724 0.46913
## Sex -9.882e-03 2.531e-01 -0.039 0.96886
## Age -1.217e-01 2.932e-02 -4.149 3.34e-05 ***
## Asian 1.897e+00 2.289e+00 0.829 0.40737
## Black 7.185e-01 4.829e-01 1.488 0.13676
## Hispanic -8.534e-03 6.679e-01 -0.013 0.98981
## Mix NA NA NA NA
## Other -8.636e-01 7.991e-01 -1.081 0.27980
## Smoking 1.137e-01 4.288e-01 0.265 0.79088
## Pet 1.115e+00 4.562e-01 2.445 0.01450 *
## Infection 1.380e+00 2.789e-01 4.949 7.46e-07 ***
## V1A1A 1.833e+01 2.752e+03 0.007 0.99469
## V1A01A3 1.971e+00 9.608e-01 2.052 0.04019 *
## V1A1A5 -1.291e+01 3.956e+03 -0.003 0.99740
## V1A01A9 1.887e+01 1.971e+03 0.010 0.99236
## V1A01A0 1.601e+00 6.099e-01 2.626 0.00864 **
## V1A1A0 4.501e-01 8.869e-01 0.508 0.61179
## V1A1A1 1.492e-01 9.451e-01 0.158 0.87454
## V1A01A2 7.593e-01 7.344e-01 1.034 0.30122
## V1A01A5 8.683e-01 1.147e+00 0.757 0.44918
## V1A11A1 1.424e+00 1.276e+00 1.116 0.26459
## V1A31A5 1.665e+01 2.384e+03 0.007 0.99443
## V1A11A2 1.582e+00 1.297e+00 1.221 0.22227
## V1A1A2 1.552e+00 1.337e+00 1.161 0.24545
## V1A11A3 1.667e-01 1.326e+00 0.126 0.89994
## V1A11A5 1.321e+00 1.508e+00 0.876 0.38114
## V1A01A8 1.524e+01 3.956e+03 0.004 0.99693
## V1A1A3 1.563e+00 1.349e+00 1.158 0.24686
## V1A21A2 1.777e+01 3.956e+03 0.004 0.99642
## V1A21A5 1.752e+00 1.539e+00 1.138 0.25499
## V1A51A10 -9.203e-01 2.257e+00 -0.408 0.68351
## V1A51A5 1.689e+01 3.956e+03 0.004 0.99659
## V1A11A10 1.659e+01 3.956e+03 0.004 0.99665
## V1A01A12 -1.406e+01 3.956e+03 -0.004 0.99716
## VNE2 1.756e+01 3.956e+03 0.004 0.99646
## VNE1 2.232e+00 7.399e-01 3.017 0.00256 **
## VNE3 -1.798e+01 3.956e+03 -0.005 0.99637
## VNE10 -1.839e+01 3.956e+03 -0.005 0.99629
## V6A26A3 -5.579e-01 4.472e-01 -1.247 0.21226
## V6A6A4 1.640e+01 3.956e+03 0.004 0.99669
## V6A6A NA NA NA NA
## V6A26A2 -8.860e-01 6.336e-01 -1.398 0.16204
## V6A6A2 5.402e-01 1.011e+00 0.534 0.59327
## V6A46A4 2.087e-01 5.595e+03 0.000 0.99997
## V6A6A3 NA NA NA NA
## V6A26A4 4.417e-01 9.388e-01 0.470 0.63803
## V6A36A3 -8.882e-01 1.019e+00 -0.872 0.38335
## V6A26A7V6A36A6 1.617e+01 2.796e+03 0.006 0.99539
## V6A36A4 -1.319e+00 1.127e+00 -1.170 0.24211
## V6A126A13V6A26A18 1.557e+01 3.956e+03 0.004 0.99686
## V6A26A8 -1.636e+00 1.691e+00 -0.967 0.33351
## V6A36A14 -3.369e+01 5.595e+03 -0.006 0.99520
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 548.30 on 401 degrees of freedom
## Residual deviance: 408.99 on 353 degrees of freedom
## AIC: 506.99
##
## Number of Fisher Scoring iterations: 16
MAodds <- exp(MultiAllelicLM$coefficients)
MALMData <- as.data.frame(cbind("Name"=matrix(names(MultiAllelicLM$coefficients), ncol=1)[-c(7,41,45)],
"OR"=na.omit(unname(MAodds)),
"P"=matrix(coef(summary(MultiAllelicLM))[,4], ncol=1)))
colnames(MALMData)[colnames(MALMData) == "V2"] <- "Beta"
colnames(MALMData)[colnames(MALMData) == "V3"] <- "P"
MALMData
## Name OR P
## 1 (Intercept) 0.733574727167779 0.469131924284838
## 2 Sex 0.990166655219187 0.968860548612354
## 3 Age 0.885441312809662 3.33815041443129e-05
## 4 Asian 6.66523537720427 0.407372978942602
## 5 Black 2.05144300430373 0.136761783878859
## 6 Hispanic 0.991502291072766 0.989805473441694
## 7 Other 0.421630599951538 0.279796493066583
## 8 Smoking 1.12042468238592 0.790884581779894
## 9 Pet 3.04995001449746 0.0144996719377657
## 10 Infection 3.97650617719077 7.46031894325567e-07
## 11 V1A1A 91044965.3903043 0.994685835521043
## 12 V1A01A3 7.18014164763176 0.0401861699941062
## 13 V1A1A5 2.48340028330359e-06 0.997397139928527
## 14 V1A01A9 156669638.577843 0.9923627566009
## 15 V1A01A0 4.96005504027756 0.00864199925188129
## 16 V1A1A0 1.56847368225219 0.611792248930491
## 17 V1A1A1 1.16093842602393 0.874540077257852
## 18 V1A01A2 2.13671339073284 0.301216164952109
## 19 V1A01A5 2.38284407507275 0.449179689452193
## 20 V1A11A1 4.15214040367493 0.264585252813042
## 21 V1A31A5 16997396.5369509 0.994427821168342
## 22 V1A11A2 4.86656505120099 0.222273743165493
## 23 V1A1A2 4.72278242746876 0.245454701468817
## 24 V1A11A3 1.18142165856906 0.899944288954544
## 25 V1A11A5 3.74730117315884 0.381143988316751
## 26 V1A01A8 4145432.1204602 0.996926896987262
## 27 V1A1A3 4.77150268059078 0.246856633512277
## 28 V1A21A2 51990237.3101743 0.996416841553639
## 29 V1A21A5 5.7641402770072 0.254985593466912
## 30 V1A51A10 0.398405970909073 0.68351451548207
## 31 V1A51A5 21689096.0142399 0.996593158221562
## 32 V1A11A10 15998726.5715601 0.996654529182827
## 33 V1A01A12 7.79940678677827e-07 0.997163561326784
## 34 VNE2 42360533.6106353 0.996458153120629
## 35 VNE1 9.31772160285101 0.00255678868979668
## 36 VNE3 1.56141881974383e-08 0.996374787593315
## 37 VNE10 1.02803546098812e-08 0.996290497046979
## 38 V6A26A3 0.5724372137033 0.212257581809937
## 39 V6A6A4 13310912.4511653 0.996691623088942
## 40 V6A26A2 0.412321888135349 0.16203628705383
## 41 V6A6A2 1.71637361441419 0.593274047726458
## 42 V6A46A4 1.23209677310105 0.99997023487875
## 43 V6A26A4 1.55528998472033 0.638032093725488
## 44 V6A36A3 0.411385235159794 0.383346424584058
## 45 V6A26A7V6A36A6 10485769.2881643 0.995386216510856
## 46 V6A36A4 0.267505012024327 0.242105038624873
## 47 V6A126A13V6A26A18 5792287.21463142 0.996859431366812
## 48 V6A26A8 0.194853670590083 0.333507932175189
## 49 V6A36A14 2.33361437760947e-15 0.995195315503984
From these we results we find significance in varients NE1, 1A0/1A0 & 1A0/1A3 with additional significance in covariates Age, Pet and Infection. Specifically if we look at the variants, NE1 is considered risky, 1A0/1A0 is risky and 1A0/1A3 is also risky based off their calculated odds ratios as each of their beta estimates are > 1.
AgebyOutcome <- data.table(cbind(MultiAllelic$Age, MultiAllelic$Outcome))
Deaths <- AgebyOutcome %>% filter( V2 == "1")
Lived <- AgebyOutcome %>% filter( V2 == "0")
hist(Lived$V1, main="Histogram of Patients Who Lived", xlab=paste0("Age (Average: ",mean(Lived$V1),")"))
abline(v=mean(Lived$V1), col="red")
hist(Deaths$V1, main="Histogram of Patients Who Died", xlab=paste0("Age (Average: ",mean(Deaths$V1),")"))
abline(v=mean(Deaths$V1), col="red")
a <- c()
Varients <- colnames(MultiAllelic[12:52])
for (i in colnames(MultiAllelic[12:52])){
x <- paste0("Outcome ~ ", i, " + Sex + Age + Asian + Black + Hispanic + Mix + Other + Smoking + Pet + Infection")
individualmodel <- glm(as.formula(x), data=MultiAllelic, family=binomial)
a <- c(a, coef(summary(individualmodel))[2,4])
}
IndMod <- as.data.frame(cbind("Varients"=Varients, "PValues"=a))
IndMod %>% arrange(PValues)
## Varients PValues
## 1 V6A6A4 0.058197709196178
## 2 VNE1 0.0669266891379583
## 3 V6A26A4 0.0678367587102841
## 4 V1A1A5 0.0855380788043218
## 5 V6A6A2 0.217923904652174
## 6 V1A1A2 0.224392669340978
## 7 V6A26A2 0.255590265278709
## 8 V1A1A0 0.256458866664515
## 9 V1A01A0 0.263780039371406
## 10 V1A51A10 0.302922483500094
## 11 V6A26A3 0.304763863984181
## 12 V6A6A3 0.304763863984181
## 13 V1A01A3 0.322612193398784
## 14 V1A11A3 0.420720174653368
## 15 V6A36A4 0.456146240600145
## 16 V1A1A3 0.494839954047621
## 17 V1A01A5 0.564471727861534
## 18 V1A11A10 0.604310699575505
## 19 V6A26A8 0.684752346166993
## 20 V1A1A1 0.719268798623756
## 21 V1A11A5 0.76963343970846
## 22 V1A11A1 0.907849270476197
## 23 V6A36A3 0.968301563216385
## 24 VNE3 0.977781121145797
## 25 VNE10 0.977969223881147
## 26 V1A21A5 0.97865391817507
## 27 VNE2 0.979699059851026
## 28 V6A46A4 0.979859765517212
## 29 V6A36A14 0.97994555111399
## 30 V1A1A 0.980542559508865
## 31 V6A6A 0.980542559508865
## 32 V1A21A2 0.981272692441606
## 33 V1A51A5 0.981704717258427
## 34 V6A126A13V6A26A18 0.982374761087439
## 35 V1A01A9 0.98251770604902
## 36 V1A01A12 0.982717575732278
## 37 V6A26A7V6A36A6 0.983107703836453
## 38 V1A01A8 0.985180654098441
## 39 V1A31A5 0.985765357406388
## 40 V1A11A2 0.989918405389124
## 41 V1A01A2 0.998531733598324
When looking at individual tests however we find that there is no significance in any of the variants.
exp(na.omit(confint.default(MultiAllelicLM, level=0.95)))
## 2.5 % 97.5 %
## (Intercept) 0.317051752 1.6972998
## Sex 0.602883584 1.6262344
## Age 0.835985684 0.9378227
## Asian 0.074988289 592.4306706
## Black 0.796176848 5.2857834
## Hispanic 0.267776059 3.6712647
## Other 0.088055198 2.0188742
## Smoking 0.483467498 2.5965581
## Pet 1.247430156 7.4570869
## Infection 2.301858865 6.8694921
## V1A1A 0.000000000 Inf
## V1A01A3 1.092273694 47.1991904
## V1A1A5 0.000000000 Inf
## V1A01A9 0.000000000 Inf
## V1A01A0 1.500989306 16.3906204
## V1A1A0 0.275784299 8.9204124
## V1A1A1 0.182105806 7.4010712
## V1A01A2 0.506527868 9.0134115
## V1A01A5 0.251460735 22.5798508
## V1A11A1 0.340460755 50.6380535
## V1A31A5 0.000000000 Inf
## V1A11A2 0.383386319 61.7743885
## V1A1A2 0.343945169 64.8495047
## V1A11A3 0.087846321 15.8886236
## V1A11A5 0.194882285 72.0551182
## V1A01A8 0.000000000 Inf
## V1A1A3 0.338860213 67.1876985
## V1A21A2 0.000000000 Inf
## V1A21A5 0.282424449 117.6431900
## V1A51A10 0.004773236 33.2536062
## V1A51A5 0.000000000 Inf
## V1A11A10 0.000000000 Inf
## V1A01A12 0.000000000 Inf
## VNE2 0.000000000 Inf
## VNE1 2.185299444 39.7290797
## VNE3 0.000000000 Inf
## VNE10 0.000000000 Inf
## V6A26A3 0.238261978 1.3753112
## V6A6A4 0.000000000 Inf
## V6A26A2 0.119098222 1.4274717
## V6A6A2 0.236408371 12.4612270
## V6A46A4 0.000000000 Inf
## V6A26A4 0.247001959 9.7931488
## V6A36A3 0.055841530 3.0306801
## V6A26A7V6A36A6 0.000000000 Inf
## V6A36A4 0.029362708 2.4370685
## V6A126A13V6A26A18 0.000000000 Inf
## V6A26A8 0.007082098 5.3611165
## V6A36A14 0.000000000 Inf
## attr(,"na.action")
## Mix V6A6A V6A6A3
## 7 41 45
## attr(,"class")
## [1] "omit"
For Individual tests we found nothing of significance for multi-allelic variants.
fullmodel <- glm(Outcome ~ ., data=MultiAllelic, family=binomial)
covmodel <- glm(Outcome ~ Sex + Age + Black + Asian + Hispanic + Other + Mix + Smoking + Pet + Infection, data=MultiAllelic, family=binomial)
lrtest(fullmodel, covmodel)
## Likelihood ratio test
##
## Model 1: Outcome ~ Sex + Age + Asian + Black + Hispanic + Mix + Other +
## Smoking + Pet + Infection + V1A1A + V1A01A3 + V1A1A5 + V1A01A9 +
## V1A01A0 + V1A1A0 + V1A1A1 + V1A01A2 + V1A01A5 + V1A11A1 +
## V1A31A5 + V1A11A2 + V1A1A2 + V1A11A3 + V1A11A5 + V1A01A8 +
## V1A1A3 + V1A21A2 + V1A21A5 + V1A51A10 + V1A51A5 + V1A11A10 +
## V1A01A12 + VNE2 + VNE1 + VNE3 + VNE10 + V6A26A3 + V6A6A4 +
## V6A6A + V6A26A2 + V6A6A2 + V6A46A4 + V6A6A3 + V6A26A4 + V6A36A3 +
## V6A26A7V6A36A6 + V6A36A4 + V6A126A13V6A26A18 + V6A26A8 +
## V6A36A14
## Model 2: Outcome ~ Sex + Age + Black + Asian + Hispanic + Other + Mix +
## Smoking + Pet + Infection
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 49 -204.50
## 2 10 -233.22 -39 57.445 0.02865 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I performed the likelihood ratio test to compare the fit of the two models, i.e. does the data fit each of the two models equally well, and from this we see that with a Chisq of 57.445 and P-Value of 0.02865, that we reject the null hypothesis that the two are equal. Therefore, the model with the additional multi-allelic variants is a better fit and offers significant improvement over the model with only the covarites as our predictors.
# plink --file RSVData --freq --mind 1 --nonfounders
Freq <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/plink.frq", sep=" ", header=T)
Freq
## CHR SNP A1 A2 MAF NCHROBS
## 1: 2 rs1130866 C T 0.47050 508
## 2: 2 rs3024798 A C 0.31250 256
## 3: 2 rs2077079 C A 0.40230 256
## 4: 8 rs4715 A C 0.25100 510
## 5: 8 rs1124 A G 0.30980 510
## 6: 10 rs1965708 A C 0.19820 782
## 7: 10 rs1965707 T C 0.27370 782
## 8: 10 rs17886395 C G 0.39900 782
## 9: 10 rs1059046 C A 0.42370 524
## 10: 10 rs1059047 C T 0.09848 528
## 11: 10 rs1136450 C G 0.44700 528
## 12: 10 rs1136451 G A 0.19080 786
## 13: 10 rs1059057 G A 0.10000 770
## 14: 10 rs4253527 T C 0.10080 784
## 15: 10 rs2243639 A G 0.35370 540
## 16: 10 rs721917 C T 0.42770 802
# plink2 --bfile RSVData --covar RSVCovariates.txt --glm hide-covar genotypic
DomRec <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/DOMDEV.PHENO1.glm.logistic", sep="\t", header=T)
DomRec
## #CHROM POS ID REF ALT A1 TEST OBS_CT OR LOG(OR)_SE
## 1: 2 85666618 rs1130866 T C C ADD 254 NA NA
## 2: 2 85666618 rs1130866 T C C DOMDEV 254 NA NA
## 3: 2 85666618 rs1130866 T C C GENO_2DF 254 NA NA
## 4: 2 85667185 rs3024798 C A A ADD 128 0.544648 0.374128
## 5: 2 85667185 rs3024798 C A A DOMDEV 128 0.662937 0.474499
## 6: 2 85667185 rs3024798 C A A GENO_2DF 128 NA NA
## 7: 2 85668215 rs2077079 A C C ADD 128 0.656310 0.314192
## 8: 2 85668215 rs2077079 A C C DOMDEV 128 0.914694 0.401808
## 9: 2 85668215 rs2077079 A C C GENO_2DF 128 NA NA
## 10: 8 22163524 rs4715 C A A ADD 255 NA NA
## 11: 8 22163524 rs4715 C A A DOMDEV 255 NA NA
## 12: 8 22163524 rs4715 C A A GENO_2DF 255 NA NA
## 13: 8 22164004 rs1124 G A A ADD 255 NA NA
## 14: 8 22164004 rs1124 G A A DOMDEV 255 NA NA
## 15: 8 22164004 rs1124 G A A GENO_2DF 255 NA NA
## 16: 10 79557289 rs1965708 C A A ADD 390 0.905022 0.258953
## 17: 10 79557289 rs1965708 C A A DOMDEV 390 0.928540 0.320549
## 18: 10 79557289 rs1965708 C A A GENO_2DF 390 NA NA
## 19: 10 79557536 rs1965707 C T T ADD 390 1.071450 0.194704
## 20: 10 79557536 rs1965707 C T T DOMDEV 390 0.935901 0.258764
## 21: 10 79557536 rs1965707 C T T GENO_2DF 390 NA NA
## 22: 10 79558907 rs17886395 G C C ADD 390 0.748748 0.127965
## 23: 10 79558907 rs17886395 G C C DOMDEV 390 1.214360 0.234634
## 24: 10 79558907 rs17886395 G C C GENO_2DF 390 NA NA
## 25: 10 79559458 rs1059046 A C C ADD 261 0.947906 0.199640
## 26: 10 79559458 rs1059046 A C C DOMDEV 261 0.774079 0.277633
## 27: 10 79559458 rs1059046 A C C GENO_2DF 261 NA NA
## 28: 10 79611881 rs1059047 T C C ADD 263 NA NA
## 29: 10 79611881 rs1059047 T C C DOMDEV 263 NA NA
## 30: 10 79611881 rs1059047 T C C GENO_2DF 263 NA NA
## 31: 10 79611973 rs1136450 G C C ADD 263 0.878743 0.189456
## 32: 10 79611973 rs1136450 G C C DOMDEV 263 0.874953 0.264355
## 33: 10 79611973 rs1136450 G C C GENO_2DF 263 NA NA
## 34: 10 79612325 rs1136451 A G G ADD 392 1.456980 0.272894
## 35: 10 79612325 rs1136451 A G G DOMDEV 392 0.759206 0.334302
## 36: 10 79612325 rs1136451 A G G GENO_2DF 392 NA NA
## 37: 10 79613765 rs1059057 A G G ADD 384 1.187250 0.619605
## 38: 10 79613765 rs1059057 A G G DOMDEV 384 0.848591 0.666238
## 39: 10 79613765 rs1059057 A G G GENO_2DF 384 NA NA
## 40: 10 79614021 rs4253527 C T T ADD 391 1.198090 0.374249
## 41: 10 79614021 rs4253527 C T T DOMDEV 391 1.268500 0.457329
## 42: 10 79614021 rs4253527 C T T GENO_2DF 391 NA NA
## 43: 10 79941966 rs2243639 G A A ADD 269 0.628845 0.204708
## 44: 10 79941966 rs2243639 G A A DOMDEV 269 1.245520 0.282497
## 45: 10 79941966 rs2243639 G A A GENO_2DF 269 NA NA
## 46: 10 79946568 rs721917 T C C ADD 400 1.186520 0.155563
## 47: 10 79946568 rs721917 T C C DOMDEV 400 1.165110 0.212786
## 48: 10 79946568 rs721917 T C C GENO_2DF 400 NA NA
## #CHROM POS ID REF ALT A1 TEST OBS_CT OR LOG(OR)_SE
## Z_OR_F_STAT P
## 1: NA NA
## 2: NA NA
## 3: NA NA
## 4: -1.6240800 0.1043580
## 5: -0.8663360 0.3863060
## 6: 3.6117100 0.0297931
## 7: -1.3403300 0.1801380
## 8: -0.2219130 0.8243820
## 9: 1.1541900 0.3185730
## 10: NA NA
## 11: NA NA
## 12: NA NA
## 13: NA NA
## 14: NA NA
## 15: NA NA
## 16: -0.3853830 0.6999540
## 17: -0.2312960 0.8170850
## 18: 0.3349570 0.7155750
## 19: 0.3544730 0.7229850
## 20: -0.2560090 0.7979440
## 21: 0.0652580 0.9368360
## 22: -2.2611800 0.0237480
## 23: 0.8277450 0.4078150
## 24: 2.6924900 0.0689706
## 25: -0.2679820 0.7887130
## 26: -0.9223730 0.3563340
## 27: 0.5308460 0.5887410
## 28: NA NA
## 29: NA NA
## 30: NA NA
## 31: -0.6822820 0.4950610
## 32: -0.5053250 0.6133300
## 33: 0.4001510 0.6706260
## 34: 1.3791600 0.1678460
## 35: -0.8240490 0.4099120
## 36: 0.9870450 0.3736010
## 37: 0.2770160 0.7817680
## 38: -0.2464250 0.8053530
## 39: 0.0385039 0.9622320
## 40: 0.4829150 0.6291560
## 41: 0.5200530 0.6030270
## 42: 1.1125600 0.3297530
## 43: -2.2660000 0.0234511
## 44: 0.7771890 0.4370470
## 45: 2.5674300 0.0786121
## 46: 1.0994000 0.2715940
## 47: 0.7181670 0.4726540
## 48: 1.1131300 0.3295430
## Z_OR_F_STAT P
In trying the dominant/recessive model I found that there were a lot more sparsity within the results with only a single significant SNP prior to p-value correction, rs3024798, with the additive models for rs2243639 & rs17886395 being significant. However when considering the p-value correction to P = 0.003125, no SNP is significant any longer.
FrequencyTable <- as.data.frame(cbind("SNP ID"=Freq$SNP,
"Chromosome"=Freq$CHR,
"Allele 1"=Freq$A1,
"Allele 1 Frequency"=(1-Freq$MAF),
"Allele 2"=Freq$A2,
"Allele 2 Frequency"=Freq$MAF))
FrequencyTable %>%
kbl(caption = "Allele Frequencies of PossibleRSV Associated SNPs") %>%
kable_classic(full_width = F, html_font = "Cambria")
| SNP ID | Chromosome | Allele 1 | Allele 1 Frequency | Allele 2 | Allele 2 Frequency |
|---|---|---|---|---|---|
| rs1130866 | 2 | C | 0.5295 | T | 0.4705 |
| rs3024798 | 2 | A | 0.6875 | C | 0.3125 |
| rs2077079 | 2 | C | 0.5977 | A | 0.4023 |
| rs4715 | 8 | A | 0.749 | C | 0.251 |
| rs1124 | 8 | A | 0.6902 | G | 0.3098 |
| rs1965708 | 10 | A | 0.8018 | C | 0.1982 |
| rs1965707 | 10 | T | 0.7263 | C | 0.2737 |
| rs17886395 | 10 | C | 0.601 | G | 0.399 |
| rs1059046 | 10 | C | 0.5763 | A | 0.4237 |
| rs1059047 | 10 | C | 0.90152 | T | 0.09848 |
| rs1136450 | 10 | C | 0.553 | G | 0.447 |
| rs1136451 | 10 | G | 0.8092 | A | 0.1908 |
| rs1059057 | 10 | G | 0.9 | A | 0.1 |
| rs4253527 | 10 | T | 0.8992 | C | 0.1008 |
| rs2243639 | 10 | A | 0.6463 | G | 0.3537 |
| rs721917 | 10 | C | 0.5723 | T | 0.4277 |
casefreq <- PEDFile[1:231,]
contfreq <- PEDFile[232:402,]
write.table(casefreq, file="/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/CaseFreq.ped",
sep="\t",
quote=F,
col.names=F,
row.names=F)
write.table(MAPFile,
"/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/CaseFreq.map",
sep="\t",
quote=F,
col.names=F,
row.names=F)
write.table(contfreq, file="/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/ControlFreq.ped",
sep="\t",
quote=F,
col.names=F,
row.names=F)
write.table(MAPFile,
"/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/ControlFreq.map",
sep="\t",
quote=F,
col.names=F,
row.names=F)
# plink --file CaseFreq --freq --mind 1 --nonfounders --out CaseFreq
# plink --file ControlFreq --freq --mind 1 --nonfounders --out ContFreq
CaFreq <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/CaseFreq.frq", sep=" ", header=T)
CoFreq <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/ContFreq.frq", sep=" ", header=T)
CaFrequencyTable <- as.data.frame(cbind("SNP ID"=CaFreq$SNP,
"Chromosome"=CaFreq$CHR,
"Allele 1"=CaFreq$A1,
"Allele 1 Frequency"=(1-CaFreq$MAF),
"Allele 2"=CaFreq$A2,
"Allele 2 Frequency"=CaFreq$MAF))
CoFrequencyTable <- as.data.frame(cbind("SNP ID"=CoFreq$SNP,
"Chromosome"=CoFreq$CHR,
"Allele 1"=CoFreq$A1,
"Allele 1 Frequency"=(1-CoFreq$MAF),
"Allele 2"=CoFreq$A2,
"Allele 2 Frequency"=CoFreq$MAF))
CaFrequencyTable %>%
kbl(caption = "Allele Frequencies of Possible Case Associated SNPs") %>%
kable_classic(full_width = F, html_font = "Cambria")
| SNP ID | Chromosome | Allele 1 | Allele 1 Frequency | Allele 2 | Allele 2 Frequency |
|---|---|---|---|---|---|
| rs1130866 | 2 | C | 0.5333 | T | 0.4667 |
| rs3024798 | 2 | A | 0.7593 | C | 0.2407 |
| rs2077079 | 2 | C | 0.6389 | A | 0.3611 |
| rs4715 | 8 | A | 0.7472 | C | 0.2528 |
| rs1124 | 8 | A | 0.6878 | G | 0.3122 |
| rs1965708 | 10 | A | 0.8111 | C | 0.1889 |
| rs1965707 | 10 | T | 0.7222 | C | 0.2778 |
| rs17886395 | 10 | C | 0.64 | G | 0.36 |
| rs1059046 | 10 | C | 0.5819 | A | 0.4181 |
| rs1059047 | 10 | C | 0.8929 | T | 0.1071 |
| rs1136450 | 10 | C | 0.5625 | G | 0.4375 |
| rs1136451 | 10 | G | 0.7928 | A | 0.2072 |
| rs1059057 | 10 | G | 0.8973 | A | 0.1027 |
| rs4253527 | 10 | T | 0.8851 | C | 0.1149 |
| rs2243639 | 10 | A | 0.68 | G | 0.32 |
| rs721917 | 10 | C | 0.5498 | T | 0.4502 |
CoFrequencyTable %>%
kbl(caption = "Allele Frequencies of Possible Control Associated SNPs") %>%
kable_classic(full_width = F, html_font = "Cambria")
| SNP ID | Chromosome | Allele 1 | Allele 1 Frequency | Allele 2 | Allele 2 Frequency |
|---|---|---|---|---|---|
| rs1130866 | 2 | C | 0.5203 | T | 0.4797 |
| rs3024798 | 2 | A | 0.6351 | C | 0.3649 |
| rs2077079 | 2 | C | 0.5676 | A | 0.4324 |
| rs4715 | 8 | A | 0.7533 | C | 0.2467 |
| rs1124 | 8 | A | 0.6959 | G | 0.3041 |
| rs1965708 | 10 | A | 0.7892 | C | 0.2108 |
| rs1965707 | 10 | T | 0.7319 | C | 0.2681 |
| rs17886395 | 10 | C | 0.5482 | G | 0.4518 |
| rs1059046 | 10 | C | 0.5659 | A | 0.4341 |
| rs1059047 | 10 | C | 0.91667 | T | 0.08333 |
| rs1136450 | 10 | C | 0.5365 | G | 0.4635 |
| rs1136451 | 10 | G | 0.8304 | A | 0.1696 |
| rs1059057 | 10 | G | 0.90361 | A | 0.09639 |
| rs4253527 | 10 | T | 0.91765 | C | 0.08235 |
| rs2243639 | 10 | A | 0.5842 | G | 0.4158 |
| rs721917 | 10 | C | 0.6029 | T | 0.3971 |
plink --bfile RSVData --covar RSVCovariates.txt --recodeAD --out RSVrecode
recode <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVrecode.raw", sep=" ", header=T)
recodecov <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVrecode.cov", sep=" ", header=T)
Rrs1130866 <- c()
Rrs3024798 <- c()
Rrs2077079 <- c()
Rrs4715 <- c()
Rrs1124 <- c()
Rrs1965708 <- c()
Rrs1965707 <- c()
Rrs17886395 <- c()
Rrs1059046 <- c()
Rrs1059047 <- c()
Rrs1136450 <- c()
Rrs1136451 <- c()
Rrs1059057 <- c()
Rrs4253527 <- c()
Rrs2243639 <- c()
Rrs721917 <- c()
recode <- dplyr::select(recode,-ends_with("HET"))
recode <- dplyr::select(recode,-c(PAT,MAT,SEX))
recode <- merge(recode, recodecov, by='FID')
recode$PHENOTYPE <- ifelse(recode$PHENOTYPE==2,1,0)
vec <- colnames(dplyr::select(recode, starts_with("rs")))
t <- 1000
N <- nrow(recode)
for (i in 1:t){
recode$PHENOTYPE <- recode$PHENOTYPE[sample(nrow(recode), N, replace=F)]
rs1130866 <- glm(
PHENOTYPE ~ rs1130866_C + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs3024798 <- glm(
PHENOTYPE ~ rs3024798_A + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs2077079 <- glm(
PHENOTYPE ~ rs2077079_C + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs4715 <- glm(
PHENOTYPE ~ rs4715_A + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs1124 <- glm(
PHENOTYPE ~ rs1124_A + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs1965708 <- glm(
PHENOTYPE ~ rs1965708_A + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs1965707 <- glm(
PHENOTYPE ~ rs1965707_T + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs17886395 <- glm(
PHENOTYPE ~ rs17886395_C + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs1059046 <- glm(
PHENOTYPE ~ rs1059046_C + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs1059047 <- glm(
PHENOTYPE ~ rs1059047_C + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs1136450 <- glm(
PHENOTYPE ~ rs1136450_C + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs1136451 <- glm(
PHENOTYPE ~ rs1136451_G + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs1059057 <- glm(
PHENOTYPE ~ rs1059057_G + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs4253527 <- glm(
PHENOTYPE ~ rs4253527_T + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs2243639 <- glm(
PHENOTYPE ~ rs2243639_A + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
rs721917 <- glm(
PHENOTYPE ~ rs721917_C + Sex + Smoking + Pets + Infection + Ancestry_6 + Ancestry_3 + Ancestry_5 + Ancestry_4 + Ancestry_1 + Age,
data=recode, family="binomial")
Rrs1130866 <- append(Rrs1130866, summary(rs1130866)$coefficients[2,4])
Rrs3024798 <- append(Rrs3024798, summary(rs3024798)$coefficients[2,4])
Rrs2077079 <- append(Rrs2077079, summary(rs2077079)$coefficients[2,4])
Rrs4715 <- append(Rrs4715, summary(rs4715)$coefficients[2,4])
Rrs1124 <- append(Rrs1124, summary(rs1124)$coefficients[2,4])
Rrs1965708 <- append(Rrs1965708, summary(rs1965708)$coefficients[2,4])
Rrs1965707 <- append(Rrs1965707, summary(rs1965707)$coefficients[2,4])
Rrs17886395 <- append(Rrs17886395, summary(rs17886395)$coefficients[2,4])
Rrs1059046 <- append(Rrs1059046, summary(rs1059046)$coefficients[2,4])
Rrs1059047 <- append(Rrs1059047, summary(rs1059047)$coefficients[2,4])
Rrs1136450 <- append(Rrs1136450, summary(rs1136450)$coefficients[2,4])
Rrs1136451 <- append(Rrs1136451, summary(rs1136451)$coefficients[2,4])
Rrs1059057 <- append(Rrs1059057, summary(rs1059057)$coefficients[2,4])
Rrs4253527 <- append(Rrs4253527, summary(rs4253527)$coefficients[2,4])
Rrs2243639 <- append(Rrs2243639, summary(rs2243639)$coefficients[2,4])
Rrs721917 <- append(Rrs721917, summary(rs721917)$coefficients[2,4])
Permutation_Results <- as.data.frame(cbind("rs1130866"=Rrs1130866,
"rs3024798"=Rrs3024798,
"rs2077079"=Rrs2077079,
"rs4715"=Rrs4715,
"rs1124"=Rrs1124,
"rs1965708"=Rrs1965708,
"rs1965707"=Rrs1965707,
"rs17886395"=Rrs17886395,
"rs1059046"=Rrs1059046,
"rs1059047"=Rrs1059047,
"rs1136450"=Rrs1136450,
"rs1136451"=Rrs1136451,
"rs1059057"=Rrs1059057,
"rs4253527"=Rrs4253527,
"rs2243639"=Rrs2243639,
"rs721917"=Rrs721917))
}
distrib <- data.frame(rowMin(as.matrix(Permutation_Results)))
colnames(distrib) <- "P_Value"
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1130866
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1130866 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[1]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[1]), col="red")
#rs4715
hist(as.numeric(distrib$P_Value), xlab=paste0("rs4715 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[4]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[4]), col="red")
#rs1124
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1124 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[5]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[5]), col="red")
#rs1965708
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1965708 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[6]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[6]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1965707
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1965707 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[7]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[7]), col="red")
#rs17886395
hist(as.numeric(distrib$P_Value), xlab=paste0("rs17886395 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[8]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[8]), col="red")
#rs1059046
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059046 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[9]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[9]), col="red")
#rs1059047
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059047 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[10]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[10]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1136450
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1136450 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[11]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[11]), col="red")
#rs1136451
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1136451 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[12]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[12]), col="red")
#rs1059057
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059057 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[13]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[13]), col="red")
#rs4253527
hist(as.numeric(distrib$P_Value), xlab=paste0("rs4253527 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[14]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[14]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs3024798
hist(as.numeric(distrib$P_Value), xlab=paste0("rs3024798 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[2]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[2]), col="red")
#rs2077079
hist(as.numeric(distrib$P_Value), xlab=paste0("rs2077079 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[3]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[3]), col="red")
#rs2243639
hist(as.numeric(distrib$P_Value), xlab=paste0("rs2243639 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[15]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[15]), col="red")
#rs721917
hist(as.numeric(distrib$P_Value), xlab=paste0("rs721917 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < UpCovAssoc$P[16]))/nrow(distrib),")"), main=NULL)
abline(v=summary(UpCovAssoc$P[16]), col="red")
recode <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVrecode.raw", sep=" ", header=T)
recodecov <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RSVrecode.cov", sep=" ", header=T)
Rrs1130866 <- c()
Rrs3024798 <- c()
Rrs2077079 <- c()
Rrs4715 <- c()
Rrs1124 <- c()
Rrs1965708 <- c()
Rrs1965707 <- c()
Rrs17886395 <- c()
Rrs1059046 <- c()
Rrs1059047 <- c()
Rrs1136450 <- c()
Rrs1136451 <- c()
Rrs1059057 <- c()
Rrs4253527 <- c()
Rrs2243639 <- c()
Rrs721917 <- c()
recode <- dplyr::select(recode,-ends_with("HET"))
recode <- dplyr::select(recode,-c(PAT,MAT,SEX))
recode <- merge(recode, recodecov, by='FID')
recode$PHENOTYPE <- ifelse(recode$PHENOTYPE==2,1,0)
vec <- colnames(dplyr::select(recode, starts_with("rs")))
t <- 1000
N <- nrow(recode)
for (i in 1:t){
recode$PHENOTYPE <- recode$PHENOTYPE[sample(nrow(recode), N, replace=F)]
rs1130866 <- glm(PHENOTYPE ~ rs1130866_C, data=recode, family="binomial")
rs3024798 <- glm(PHENOTYPE ~ rs3024798_A, data=recode, family="binomial")
rs2077079 <- glm(PHENOTYPE ~ rs2077079_C, data=recode, family="binomial")
rs4715 <- glm(PHENOTYPE ~ rs4715_A, data=recode, family="binomial")
rs1124 <- glm(PHENOTYPE ~ rs1124_A, data=recode, family="binomial")
rs1965708 <- glm(PHENOTYPE ~ rs1965708_A, data=recode, family="binomial")
rs1965707 <- glm(PHENOTYPE ~ rs1965707_T, data=recode, family="binomial")
rs17886395 <- glm(PHENOTYPE ~ rs17886395_C, data=recode, family="binomial")
rs1059046 <- glm(PHENOTYPE ~ rs1059046_C, data=recode, family="binomial")
rs1059047 <- glm(PHENOTYPE ~ rs1059047_C, data=recode, family="binomial")
rs1136450 <- glm(PHENOTYPE ~ rs1136450_C, data=recode, family="binomial")
rs1136451 <- glm(PHENOTYPE ~ rs1136451_G, data=recode, family="binomial")
rs1059057 <- glm(PHENOTYPE ~ rs1059057_G, data=recode, family="binomial")
rs4253527 <- glm(PHENOTYPE ~ rs4253527_T, data=recode, family="binomial")
rs2243639 <- glm(PHENOTYPE ~ rs2243639_A, data=recode, family="binomial")
rs721917 <- glm(PHENOTYPE ~ rs721917_C, data=recode, family="binomial")
Rrs1130866 <- append(Rrs1130866, summary(rs1130866)$coefficients[2,4])
Rrs3024798 <- append(Rrs3024798, summary(rs3024798)$coefficients[2,4])
Rrs2077079 <- append(Rrs2077079, summary(rs2077079)$coefficients[2,4])
Rrs4715 <- append(Rrs4715, summary(rs4715)$coefficients[2,4])
Rrs1124 <- append(Rrs1124, summary(rs1124)$coefficients[2,4])
Rrs1965708 <- append(Rrs1965708, summary(rs1965708)$coefficients[2,4])
Rrs1965707 <- append(Rrs1965707, summary(rs1965707)$coefficients[2,4])
Rrs17886395 <- append(Rrs17886395, summary(rs17886395)$coefficients[2,4])
Rrs1059046 <- append(Rrs1059046, summary(rs1059046)$coefficients[2,4])
Rrs1059047 <- append(Rrs1059047, summary(rs1059047)$coefficients[2,4])
Rrs1136450 <- append(Rrs1136450, summary(rs1136450)$coefficients[2,4])
Rrs1136451 <- append(Rrs1136451, summary(rs1136451)$coefficients[2,4])
Rrs1059057 <- append(Rrs1059057, summary(rs1059057)$coefficients[2,4])
Rrs4253527 <- append(Rrs4253527, summary(rs4253527)$coefficients[2,4])
Rrs2243639 <- append(Rrs2243639, summary(rs2243639)$coefficients[2,4])
Rrs721917 <- append(Rrs721917, summary(rs721917)$coefficients[2,4])
Permutation_Results <- as.data.frame(cbind("rs1130866"=Rrs1130866,
"rs3024798"=Rrs3024798,
"rs2077079"=Rrs2077079,
"rs4715"=Rrs4715,
"rs1124"=Rrs1124,
"rs1965708"=Rrs1965708,
"rs1965707"=Rrs1965707,
"rs17886395"=Rrs17886395,
"rs1059046"=Rrs1059046,
"rs1059047"=Rrs1059047,
"rs1136450"=Rrs1136450,
"rs1136451"=Rrs1136451,
"rs1059057"=Rrs1059057,
"rs4253527"=Rrs4253527,
"rs2243639"=Rrs2243639,
"rs721917"=Rrs721917))
}
distrib <- data.frame(rowMin(as.matrix(Permutation_Results)))
colnames(distrib) <- "P_Value"
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1130866
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1130866 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[1]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[1]), col="red")
#rs4715
hist(as.numeric(distrib$P_Value), xlab=paste0("rs4715 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[4]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[4]), col="red")
#rs1124
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1124 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[5]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[5]), col="red")
#rs1965708
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1965708 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[6]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[6]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1965707
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1965707 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[7]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[7]), col="red")
#rs17886395
hist(as.numeric(distrib$P_Value), xlab=paste0("rs17886395 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[8]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[8]), col="red")
#rs1059046
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059046 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[9]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[9]), col="red")
#rs1059047
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059047 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[10]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[10]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1136450
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1136450 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[11]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[11]), col="red")
#rs1136451
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1136451 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[12]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[12]), col="red")
#rs1059057
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059057 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[13]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[13]), col="red")
#rs4253527
hist(as.numeric(distrib$P_Value), xlab=paste0("rs4253527 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[14]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[14]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs3024798
hist(as.numeric(distrib$P_Value), xlab=paste0("rs3024798 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[2]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[2]), col="red")
#rs2077079
hist(as.numeric(distrib$P_Value), xlab=paste0("rs2077079 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[3]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[3]), col="red")
#rs2243639
hist(as.numeric(distrib$P_Value), xlab=paste0("rs2243639 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[15]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[15]), col="red")
#rs721917
hist(as.numeric(distrib$P_Value), xlab=paste0("rs721917 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < Assoc2$P[16]))/nrow(distrib),")"), main=NULL)
abline(v=summary(Assoc2$P[16]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1130866
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1130866 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[1]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[1]), col="red")
#rs4715
hist(as.numeric(distrib$P_Value), xlab=paste0("rs4715 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[4]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[4]), col="red")
#rs1124
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1124 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[5]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[5]), col="red")
#rs1965708
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1965708 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[6]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[6]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1965707
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1965707 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[7]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[7]), col="red")
#rs17886395
hist(as.numeric(distrib$P_Value), xlab=paste0("rs17886395 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[8]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[8]), col="red")
#rs1059046
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059046 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[9]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[9]), col="red")
#rs1059047
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059047 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[10]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[10]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs1136450
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1136450 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[11]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[11]), col="red")
#rs1136451
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1136451 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[12]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[12]), col="red")
#rs1059057
hist(as.numeric(distrib$P_Value), xlab=paste0("rs1059057 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[13]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[13]), col="red")
#rs4253527
hist(as.numeric(distrib$P_Value), xlab=paste0("rs4253527 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[14]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[14]), col="red")
par(mfrow=c(2,2), oma=c(0, 0, 2, 0))
#rs3024798
hist(as.numeric(distrib$P_Value), xlab=paste0("rs3024798 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[2]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[2]), col="red")
#rs2077079
hist(as.numeric(distrib$P_Value), xlab=paste0("rs2077079 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[3]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[3]), col="red")
#rs2243639
hist(as.numeric(distrib$P_Value), xlab=paste0("rs2243639 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[15]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[15]), col="red")
#rs721917
hist(as.numeric(distrib$P_Value), xlab=paste0("rs721917 Min P Estimates (P-Value: ",nrow(filter(distrib, distrib$P_Value < NoCov$P[16]))/nrow(distrib),")"), main=NULL)
abline(v=summary(NoCov$P[16]), col="red")
recode2 <- recode[,3:30]
recode2 <- subset(recode2, select=-c(IID.y))
write.table(recode2, "/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RecodedSNPs.txt", quote=F, sep="\t", row.names=F, col.names=T)
FullSNP <- fread("/Users/keenananderson-fears/Desktop/Dr._Liu_Lab/Consultations/Gandhi/RecodedSNPs2.txt", sep="\t", header=T)
fullSNPmodel <- glm(PHENOTYPE ~ ., data=FullSNP, family=binomial)
summary(fullSNPmodel)
##
## Call:
## glm(formula = PHENOTYPE ~ ., family = binomial, data = FullSNP)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1650 -1.1503 0.7798 1.0042 2.0117
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.111e+00 6.866e-01 1.618 0.10557
## V1 -3.323e-04 9.457e-04 -0.351 0.72529
## rs1130866_C 1.699e-02 1.985e-01 0.086 0.93179
## rs3024798_A -1.013e+00 5.297e-01 -1.912 0.05583 .
## rs2077079_C 5.046e-01 4.996e-01 1.010 0.31252
## rs4715_A 1.717e-02 3.374e-01 0.051 0.95942
## rs1124_A -4.358e-02 3.270e-01 -0.133 0.89397
## rs1965708_A -5.409e-01 4.096e-01 -1.321 0.18665
## rs1965707_T 3.951e-01 3.608e-01 1.095 0.27344
## rs17886395_C -4.705e-01 1.549e-01 -3.038 0.00238 **
## rs1059046_C 3.724e-02 3.543e-01 0.105 0.91628
## rs1059047_C 1.322e+00 6.173e-01 2.142 0.03222 *
## rs1136450_C -3.916e-01 3.242e-01 -1.208 0.22704
## rs1136451_G -9.724e-02 5.615e-01 -0.173 0.86250
## rs1059057_G -5.101e-01 7.099e-01 -0.719 0.47244
## rs4253527_T 3.403e-01 5.184e-01 0.656 0.51154
## rs2243639_A -4.522e-01 2.261e-01 -2.000 0.04545 *
## rs721917_C 1.516e-01 1.910e-01 0.794 0.42724
## Sex 2.299e-01 2.178e-01 1.056 0.29103
## Smoking -1.011e-01 3.222e-01 -0.314 0.75365
## Pets 6.799e-02 3.311e-01 0.205 0.83730
## Infection -1.643e-01 2.242e-01 -0.733 0.46356
## Ancestry_6 -1.788e-02 3.824e-01 -0.047 0.96271
## Ancestry_3 7.565e-02 7.085e-01 0.107 0.91496
## Ancestry_5 -2.366e-01 7.960e-01 -0.297 0.76628
## Ancestry_4 -1.437e+01 5.354e+02 -0.027 0.97860
## Ancestry_1 -1.507e+00 1.220e+00 -1.235 0.21685
## Age -7.812e-03 2.029e-02 -0.385 0.70020
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 548.30 on 401 degrees of freedom
## Residual deviance: 513.25 on 374 degrees of freedom
## AIC: 569.25
##
## Number of Fisher Scoring iterations: 12
In the full SNP model which performs a logistic regression on all non-multi-allelic SNPs three SNPs are found to be significant, specifically rs17886395 (0.00238), rs1059047 (0.03222) & rs2243639 (0.04545).
FullSNP2 <- FullSNP[,1:18]
fullSNPmodel2 <- glm(PHENOTYPE ~ ., data=FullSNP2, family="binomial")
summary(fullSNPmodel2)
##
## Call:
## glm(formula = PHENOTYPE ~ ., family = "binomial", data = FullSNP2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1703 -1.1845 0.7749 0.9971 1.6724
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2443551 0.5162157 2.411 0.0159 *
## V1 -0.0001606 0.0009266 -0.173 0.8624
## rs1130866_C -0.0100724 0.1959957 -0.051 0.9590
## rs3024798_A -1.0105575 0.5206080 -1.941 0.0522 .
## rs2077079_C 0.5060630 0.4871903 1.039 0.2989
## rs4715_A 0.0339179 0.3321206 0.102 0.9187
## rs1124_A -0.0453215 0.3213087 -0.141 0.8878
## rs1965708_A -0.5480128 0.4054379 -1.352 0.1765
## rs1965707_T 0.4094321 0.3584099 1.142 0.2533
## rs17886395_C -0.4721751 0.1528249 -3.090 0.0020 **
## rs1059046_C 0.0516911 0.3514990 0.147 0.8831
## rs1059047_C 1.2753842 0.6108299 2.088 0.0368 *
## rs1136450_C -0.3737685 0.3200567 -1.168 0.2429
## rs1136451_G -0.1440679 0.5530747 -0.260 0.7945
## rs1059057_G -0.4163238 0.6991972 -0.595 0.5516
## rs4253527_T 0.3598993 0.5130949 0.701 0.4830
## rs2243639_A -0.4567987 0.2241474 -2.038 0.0416 *
## rs721917_C 0.1773194 0.1885183 0.941 0.3469
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 548.30 on 401 degrees of freedom
## Residual deviance: 519.56 on 384 degrees of freedom
## AIC: 555.56
##
## Number of Fisher Scoring iterations: 4
In the second full SNP model we simply removed the covariates and ran the logistic regression, with rs17886395 (0.0020), rs1059047 (0.0368) & rs2243639 (0.0416) being found to be significant again with similar p-values as in the original model with covariates.
SNP1odds <- exp(fullSNPmodel$coefficients)
SNP1Data <- as.data.frame(cbind("Name"=matrix(names(fullSNPmodel$coefficients), ncol=1),
"OR"=na.omit(unname(SNP1odds)),
"P"=matrix(coef(summary(fullSNPmodel))[,4], ncol=1),
"2.5%"=matrix(exp(na.omit(confint.default(fullSNPmodel, level=0.95)))[,1]),
"97.5%"=matrix(exp(na.omit(confint.default(fullSNPmodel, level=0.95)))[,2])))
SNP1Data = SNP1Data[-c(1,2,19,20,21,22,23,24,25,26,27,28),]
colnames(SNP1Data)[colnames(SNP1Data) == "V1"] <- "Variant"
colnames(SNP1Data)[colnames(SNP1Data) == "V2"] <- "Beta"
colnames(SNP1Data)[colnames(SNP1Data) == "V3"] <- "P"
colnames(SNP1Data)[colnames(SNP1Data) == "V4"] <- "2.5%"
colnames(SNP1Data)[colnames(SNP1Data) == "V5"] <- "97.5%"
rownames(SNP1Data) <- NULL
SNP2odds <- exp(fullSNPmodel2$coefficients)
SNP2Data <- as.data.frame(cbind("Name"=matrix(names(fullSNPmodel2$coefficients), ncol=1),
"OR"=na.omit(unname(SNP2odds)),
"P"=matrix(coef(summary(fullSNPmodel2))[,4], ncol=1),
"2.5%"=matrix(exp(na.omit(confint.default(fullSNPmodel2, level=0.95)))[,1]),
"97.5%"=matrix(exp(na.omit(confint.default(fullSNPmodel2, level=0.95)))[,2])))
SNP2Data = SNP2Data[-c(1,2),]
colnames(SNP2Data)[colnames(SNP2Data) == "V1"] <- "Variant"
colnames(SNP2Data)[colnames(SNP2Data) == "V2"] <- "Beta"
colnames(SNP2Data)[colnames(SNP2Data) == "V3"] <- "P"
colnames(SNP2Data)[colnames(SNP2Data) == "V4"] <- "2.5%"
colnames(SNP2Data)[colnames(SNP2Data) == "V5"] <- "97.5%"
rownames(SNP2Data) <- NULL
SNP1Data %>%
kbl(caption = "Odds Ratios & CIs of Non-Multiallelic SNPs with Covariates") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Variant | OR | P | 2.5% | 97.5% |
|---|---|---|---|---|
| rs1130866_C | 1.01713137380399 | 0.93179155114509 | 0.689364039980758 | 1.50074006123857 |
| rs3024798_A | 0.363159983221053 | 0.0558349614912301 | 0.128598683540082 | 1.02555617042541 |
| rs2077079_C | 1.65627854574515 | 0.312516484467051 | 0.622119244840757 | 4.40953827396524 |
| rs4715_A | 1.0173142421815 | 0.959419214169963 | 0.525155184987143 | 1.97070941491447 |
| rs1124_A | 0.957356091574371 | 0.893974786952101 | 0.504359614143277 | 1.81721664537199 |
| rs1965708_A | 0.58225176416397 | 0.186650485381822 | 0.260908600108923 | 1.29937118489204 |
| rs1965707_T | 1.48456342243797 | 0.273435569918901 | 0.731985305690184 | 3.01089180084381 |
| rs17886395_C | 0.624689514776487 | 0.00238264742216896 | 0.461138114190819 | 0.846247529455357 |
| rs1059046_C | 1.03794281125211 | 0.916276369990059 | 0.518364104439841 | 2.0783176732388 |
| rs1059047_C | 3.75110240747731 | 0.0322163391738038 | 1.11872860611497 | 12.5774644489032 |
| rs1136450_C | 0.675958522895938 | 0.227041975841574 | 0.358072296015071 | 1.27605494689382 |
| rs1136451_G | 0.907335104655631 | 0.862504976224902 | 0.301871019448445 | 2.72718127644262 |
| rs1059057_G | 0.600448390204921 | 0.4724419413366 | 0.14934789968379 | 2.41408329185102 |
| rs4253527_T | 1.40535329458134 | 0.511540625011172 | 0.508786711288969 | 3.88181892091303 |
| rs2243639_A | 0.63619550471594 | 0.0454498497726392 | 0.408467831006509 | 0.990885179925761 |
| rs721917_C | 1.16370342230608 | 0.42723896481475 | 0.80038203451961 | 1.69194909016129 |
SNP2Data %>%
kbl(caption = "Odds Ratios & CIs of Non-Multiallelic SNPs without Covariates") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Variant | OR | P | 2.5% | 97.5% |
|---|---|---|---|---|
| rs1130866_C | 0.989978133172503 | 0.959013928612707 | 0.67420772881068 | 1.45364204870294 |
| rs3024798_A | 0.364015993228228 | 0.0522449059258775 | 0.131213389480167 | 1.00986373304503 |
| rs2077079_C | 1.65874778631325 | 0.298926736870718 | 0.638385762856372 | 4.3100024760705 |
| rs4715_A | 1.03449970540474 | 0.918657189623131 | 0.539546354148601 | 1.98349897511817 |
| rs1124_A | 0.955690158670641 | 0.887828155473869 | 0.509118250796679 | 1.79397159294662 |
| rs1965708_A | 0.578097455162327 | 0.176485173522799 | 0.261151007818047 | 1.27970659756368 |
| rs1965707_T | 1.50596226549719 | 0.25330558107754 | 0.745993542089207 | 3.04013669977618 |
| rs17886395_C | 0.623644299123989 | 0.00200393917080188 | 0.462223659975525 | 0.841437264051878 |
| rs1059046_C | 1.05305042339325 | 0.883085395263228 | 0.52875278689298 | 2.09722808408225 |
| rs1059047_C | 3.58007656832181 | 0.0368020558030007 | 1.08131688979259 | 11.8530916848116 |
| rs1136450_C | 0.688136174363845 | 0.242879456595185 | 0.367486717988902 | 1.28856737206597 |
| rs1136451_G | 0.865828966282129 | 0.794489369138601 | 0.292855965801092 | 2.55982423578951 |
| rs1059057_G | 0.659466689231917 | 0.551555288078532 | 0.167507834352524 | 2.59627447210174 |
| rs4253527_T | 1.43318501780874 | 0.483035768181844 | 0.524270140977851 | 3.9178643503144 |
| rs2243639_A | 0.633307789893682 | 0.0415560637378452 | 0.408150362213138 | 0.982674018871935 |
| rs721917_C | 1.1940124486883 | 0.346912270507755 | 0.825166866510544 | 1.72773021492183 |
The odds ratios for the SNP model with covariates are, rs17886395 (0.6246), rs1059047 (3.751), and rs2243639 (0.6361). Odds ratios for the SNP model without covariates are rs17886395 (0.6236), rs1059047 (3.580), and rs2243639 (0.6333).